# Load the fitted mixture model from disk; file_refit = "never" guarantees the
# cached fit is used as-is and is never re-estimated.
hm <- brm(file = "final_models/mix_final_vienna.rds", file_refit = "never")
# Posterior predictive check on a few draws; x-axis truncated because APCs
# have a long right tail.
pp_check(hm, ndraws = 10) +
coord_cartesian(xlim = c(0, 8000))
pred_vis <- function(df, model, country_selection,
                     var_for_offset = base$P_top10, alpha = 1, ndraws = 1000) {
  # Plot posterior predictions against observed APCs for a single country,
  # faceted by field.
  #
  # df: data the model was fitted on (P_top10, country, field, APC_in_dollar)
  # model: fitted brms model
  # country_selection: name of the country to display
  # var_for_offset: scaled predictor whose "scaled:center" attribute is used
  #   to back-transform P_top10 onto its original scale
  # alpha: transparency for the observed points
  # ndraws: number of posterior draws used for the predictions
  center <- attributes(var_for_offset)[["scaled:center"]]
  # Undo the centring + log transform of P_top10
  unscale <- function(d) mutate(d, P_top10 = exp(center + P_top10))

  observed <- df %>%
    filter(country == country_selection)

  predicted <- observed %>%
    modelr::data_grid(P_top10, country, field) %>%
    add_predicted_draws(model, ndraws = ndraws, re_formula = NULL) %>%
    unscale()

  ggplot(predicted, aes(P_top10, .prediction)) +
    stat_interval() +
    scale_color_manual(values = colorspace::lighten(clrs[4], c(.8, .67, .42))) +
    scale_y_continuous(labels = dollar) +
    geom_jitter(aes(y = APC_in_dollar), alpha = alpha,
                position = position_jitter(width = 5, height = 50),
                data = unscale(observed)) +
    facet_wrap(vars(field)) +
    labs(y = "Predicted vs. actual APC", x = expression(P["top 10%"]),
         color = "Credible interval") +
    theme_clean() +
    theme(legend.position = "bottom", panel.grid.minor = element_blank())
}
# Predicted-vs-observed plots for individual countries (alpha apparently
# lowered for countries with more observations — confirm).
pred_vis(base, hm, "Austria", alpha = .5)
pred_vis(base, hm, "Brazil", alpha = .2)
This updated model fares much better for Brazil. The predictions are
still not ideal (they underestimate higher APCs of around $2,000), but they
are a clear improvement over the previous model.
# Further countries for visual validation
pred_vis(base, hm, "China", alpha = .15)
pred_vis(base, hm, "United States", alpha = .2)
pred_vis(base, hm, "Turkey", alpha = .7)
# Full parameter summary of the mixture model (output transcript below)
summary(hm)
## Family: mixture(hurdle_lognormal, hurdle_lognormal)
## Links: mu1 = identity; sigma1 = identity; hu1 = logit; mu2 = identity; sigma2 = identity; hu2 = logit; theta1 = identity; theta2 = identity
## Formula: APC_in_dollar | weights(total_weight) ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
## hu1 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
## hu2 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
## theta1 ~ 1 + (1 | field)
## Data: base (Number of observations: 116912)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Group-Level Effects:
## ~country (Number of levels: 69)
## Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept) 0.47 0.06 0.37 0.59 1.00
## sd(mu1_P_top10) 0.05 0.04 0.00 0.16 1.00
## sd(mu2_Intercept) 0.03 0.01 0.02 0.04 1.01
## sd(mu2_P_top10) 0.01 0.00 0.00 0.02 1.00
## sd(hu1_Intercept) 1.04 0.18 0.73 1.44 1.00
## sd(hu1_P_top10) 0.17 0.11 0.01 0.41 1.00
## sd(hu2_Intercept) 1.93 0.22 1.54 2.40 1.00
## sd(hu2_P_top10) 0.26 0.13 0.02 0.53 1.00
## cor(mu1_Intercept,mu1_P_top10) -0.19 0.34 -0.81 0.51 1.00
## cor(mu2_Intercept,mu2_P_top10) -0.25 0.37 -0.82 0.57 1.00
## cor(hu1_Intercept,hu1_P_top10) -0.32 0.38 -0.87 0.57 1.00
## cor(hu2_Intercept,hu2_P_top10) -0.21 0.31 -0.77 0.43 1.00
## Bulk_ESS Tail_ESS
## sd(mu1_Intercept) 931 2059
## sd(mu1_P_top10) 622 979
## sd(mu2_Intercept) 1681 2513
## sd(mu2_P_top10) 1347 1702
## sd(hu1_Intercept) 1844 2535
## sd(hu1_P_top10) 1186 1658
## sd(hu2_Intercept) 1495 2460
## sd(hu2_P_top10) 573 960
## cor(mu1_Intercept,mu1_P_top10) 3926 2577
## cor(mu2_Intercept,mu2_P_top10) 3996 2803
## cor(hu1_Intercept,hu1_P_top10) 4009 2740
## cor(hu2_Intercept,hu2_P_top10) 3047 2402
##
## ~field (Number of levels: 19)
## Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept) 0.28 0.06 0.19 0.42 1.00
## sd(mu1_P_top10) 0.07 0.02 0.04 0.13 1.00
## sd(mu2_Intercept) 0.44 0.08 0.31 0.63 1.00
## sd(mu2_P_top10) 0.02 0.01 0.01 0.03 1.00
## sd(hu1_Intercept) 1.06 0.21 0.71 1.54 1.01
## sd(hu1_P_top10) 0.15 0.07 0.02 0.30 1.00
## sd(hu2_Intercept) 2.30 0.33 1.72 3.02 1.00
## sd(hu2_P_top10) 0.46 0.11 0.28 0.71 1.00
## sd(theta1_Intercept) 0.71 0.20 0.41 1.18 1.00
## cor(mu1_Intercept,mu1_P_top10) 0.18 0.30 -0.43 0.72 1.00
## cor(mu2_Intercept,mu2_P_top10) -0.21 0.42 -0.85 0.66 1.00
## cor(hu1_Intercept,hu1_P_top10) -0.56 0.29 -0.94 0.17 1.00
## cor(hu2_Intercept,hu2_P_top10) 0.31 0.24 -0.21 0.72 1.00
## Bulk_ESS Tail_ESS
## sd(mu1_Intercept) 1253 1855
## sd(mu1_P_top10) 2052 3224
## sd(mu2_Intercept) 1147 2067
## sd(mu2_P_top10) 1620 1944
## sd(hu1_Intercept) 1367 2317
## sd(hu1_P_top10) 1480 1124
## sd(hu2_Intercept) 1854 2309
## sd(hu2_P_top10) 2116 2724
## sd(theta1_Intercept) 1146 2105
## cor(mu1_Intercept,mu1_P_top10) 3688 2998
## cor(mu2_Intercept,mu2_P_top10) 4088 2820
## cor(hu1_Intercept,hu1_P_top10) 3661 2506
## cor(hu2_Intercept,hu2_P_top10) 2797 2487
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## mu1_Intercept 6.75 0.10 6.55 6.95 1.00 1051 1749
## hu1_Intercept -0.62 0.34 -1.31 0.07 1.01 1084 1595
## mu2_Intercept 7.38 0.10 7.17 7.59 1.00 552 962
## hu2_Intercept 0.01 0.57 -1.14 1.09 1.01 795 1218
## theta1_Intercept -0.69 0.18 -1.03 -0.32 1.00 1070 1683
## mu1_P_top10 0.17 0.03 0.11 0.24 1.00 2146 1816
## hu1_P_top10 0.02 0.09 -0.14 0.22 1.00 2426 2541
## mu2_P_top10 0.01 0.01 -0.00 0.03 1.00 2498 2451
## hu2_P_top10 -0.11 0.15 -0.41 0.18 1.00 1982 2623
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma1 0.87 0.01 0.85 0.89 1.00 8372 3006
## sigma2 0.21 0.00 0.20 0.21 1.00 8462 2979
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Here we compute marginal effects manually, by making predictions for a given x (P_top10) and for the same x multiplied by a small factor — 1.1 (a 10% increase on the original scale) in the example directly below, and 1.01 (a 1% increase) for the average marginal effects further down — and then comparing the predictions.
# Marginal effect "by hand": predict at x = 1500 and at x * 1.1 on the
# original P_top10 scale, then compare the posterior draws.
scale_offset <- attributes(base$P_top10)[["scaled:center"]]
x1_identity <- 1500
x2_identity <- x1_identity * 1.1
# Map both points onto the predictor scale used by the model (log, centred)
x1_log <- log(x1_identity) - scale_offset
x2_log <- log(x2_identity) - scale_offset
# Posterior predictions for Brazil across all fields at the two points
contrast <- predictions(
hm,
newdata = datagrid(P_top10 = c(x1_log, x2_log),
country = "Brazil", field = unique(base$field))) %>%
posteriordraws()
# Reshape to one row per draw with columns "base" and "step", then compute
# the relative change between the two predictions
contrast_recomputed <- contrast %>%
mutate(x = factor(P_top10, labels = c("base", "step"))) %>%
pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid),
names_from = x, values_from = draw) %>%
mutate(contrast = step / base - 1)
contrast_recomputed %>%
ggplot(aes(contrast, fct_reorder(field, contrast))) +
stat_halfeye() +
scale_x_continuous(labels = percent)
This is very sensitive to the respective country. Maybe we can recompute an average marginal effect after all?
average_draws <- function(model, orig_var, q = .5,
                          var_for_offset = base$P_top10, step_factor = 1.01) {
  # Posterior draws of the relative change in predicted APC when P_top10
  # (on its original scale) is multiplied by `step_factor`, evaluated at the
  # q-th quantile of `orig_var`, over all country/field combinations.
  #
  # model: fitted brms model
  # orig_var: P_top10 on the original (untransformed) scale
  # q: quantile of orig_var at which the effect is evaluated
  # var_for_offset: scaled predictor carrying the "scaled:center" attribute
  # step_factor: multiplicative step on the original scale (default 1.01,
  #   i.e. a 1% increase; previously hard-coded). Named `step_factor`, not
  #   `step`, so it cannot collide with the `step` column under data masking.
  #
  # Returns a wide data frame of draws with columns `base`, `step`, and
  # `contrast` (= step / base - 1).
  # NOTE(review): reads the global `base` data frame for the country and
  # field grids.
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  x1_identity <- quantile(orig_var, q)
  x2_identity <- x1_identity * step_factor
  # Map both points onto the model's (log, centred) predictor scale
  x1_log <- log(x1_identity) - scale_offset
  x2_log <- log(x2_identity) - scale_offset
  contrast_all <- predictions(
    model,
    newdata = datagrid(P_top10 = c(x1_log, x2_log),
                       country = unique(base$country),
                       field = unique(base$field))) %>%
    posteriordraws()
  contrast_all %>%
    mutate(x = factor(P_top10, labels = c("base", "step"))) %>%
    pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid),
                names_from = x, values_from = draw) %>%
    mutate(contrast = step / base - 1)
}
summarise_by <- function(contrast_df, var = field) {
  # Average the draw-level contrasts within each level of `var` (e.g. field
  # or country), yielding one mean effect per group and posterior draw.
  # `.groups = "drop"` returns an ungrouped result and silences dplyr's
  # "`summarise()` has grouped output" message seen throughout the transcript.
  contrast_df %>%
    group_by({{ var }}, drawid) %>%
    summarise(effect = mean(contrast), .groups = "drop")
}
plot_effect <- function(contrast_df, location = "the median",
                        xlim = c(-0.005, 0.005)) {
  # Halfeye plot of the per-field effect distribution.
  #
  # contrast_df: output of summarise_by(), with columns `field` and `effect`
  # location: text inserted into the x-axis label (e.g. "the 20% quantile")
  # xlim: x-axis limits; passed to coord_cartesian() so it only zooms and
  #   drops no data (previously hard-coded)
  contrast_df %>%
    ggplot(aes(effect, fct_reorder(field, effect))) +
    stat_halfeye(.width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      y = NULL,
      x = glue::glue("% change of APC for 1% change of P_top10% at {location}"),
      caption = "Averaged predictions over all countries.") +
    theme_clean() +
    coord_cartesian(xlim = xlim)
}
# Draw-level contrasts at three quantiles of the original P_top10
# NOTE(review): the data frame is called `df` here but `base` elsewhere —
# confirm both refer to the same data.
contrast_20 <- average_draws(hm, df$P_top10, q = .2)
contrast_50 <- average_draws(hm, df$P_top10, q = .5)
contrast_80 <- average_draws(hm, df$P_top10, q = .8)
# Aggregate to one mean effect per field and draw
contrast_20_field <- summarise_by(contrast_20, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_20_field %>%
plot_effect("the 20% quantile")
This seems reasonable, but we would need to validate it further.
At the 50% quantile:
contrast_50_field <- summarise_by(contrast_50, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Default location label is "the median", which matches q = .5 here
contrast_50_field %>%
plot_effect()
contrast_80_field <- summarise_by(contrast_80, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_80_field %>%
  # Pass the correct location label: the default is "the median", which would
  # mislabel this 80%-quantile effect (cf. the explicit label at the 20% plot).
  plot_effect("the 80% quantile")
Compare all three
# Combine the three per-field summaries; bind_rows() fills missing effect_*
# columns with NA, which pivot_longer() + drop_na() then removes.
all_joined <- bind_rows(
rename(contrast_50_field, effect_50 = effect),
rename(contrast_80_field, effect_80 = effect),
rename(contrast_20_field, effect_20 = effect)
)
p <- all_joined %>%
pivot_longer(contains("effect"), values_to = "effect") %>%
drop_na() %>%
ggplot(aes(effect, fct_reorder(field, effect), colour = name)) +
geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
stat_pointinterval(position = position_dodge(width = .5),
.width = c(.5, .9)) +
scale_x_continuous(labels = percent) +
labs(
y = NULL,
x = glue::glue("% Change of APC for 1% change of P_top10% at given quantiles"),
caption = "Averaged predictions over all countries.") +
theme_clean() +
coord_cartesian(xlim = c(-0.005, 0.005)) +
guides(colour = guide_legend(reverse = TRUE))
# not sure which version is better
# Variant 1: single hue, lightened for the lower quantiles
p + scale_color_manual(values = c(
effect_20 = colorspace::lighten("#007FA8", .7),
effect_50 = colorspace::lighten("#007FA8", .4),
effect_80 = "#007FA8"
))
# Variant 2: colours from the "Mako" sequential palette
custom_pal <- sequential_hcl(5, palette = "Mako") %>%
lighten(amount = .1)
# NOTE(review): effect_20 and effect_50 both use custom_pal[3] — possibly
# custom_pal[2] was intended for effect_20; confirm.
p + scale_color_manual(values = c(
effect_20 = custom_pal[3],
effect_50 = custom_pal[3],
effect_80 = custom_pal[4]
))
Questions that arise: - Why do some fields show a stronger/weaker effect at larger/smaller P_top10? Is this also associated with the hurdle component? - In particular: why are physics and mathematics negative — because of the hurdle?
Need to put into context of overall averages: give average per field (from model or full set)
# Baseline prediction per field: average the "base" draws (prediction at the
# median P_top10) over countries, per draw, and plot the distribution.
contrast_50 %>%
group_by(field, drawid) %>%
summarise(intercept = mean(base)) %>%
ggplot(aes(intercept, fct_reorder(field, intercept))) +
stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
point_interval = "median_hdi") +
scale_x_continuous(labels = scales::dollar) +
theme_clean() +
labs(y = NULL, x = expression(paste("Estimated APC at median of ",
P["top 10%"])))
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Same contrasts, now summarised per country instead of per field
contrast_20_country <- summarise_by(contrast_20, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_50_country <- summarise_by(contrast_50, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_80_country <- summarise_by(contrast_80, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Combine the per-country effects at the three quantiles into long format.
# (A trailing comma after the last bind_rows() argument was removed: it only
# worked because rlang's list2() tolerates trailing commas, and is not
# idiomatic R.)
all_countries <- bind_rows(
  rename(contrast_20_country, effect_20 = effect),
  rename(contrast_50_country, effect_50 = effect),
  rename(contrast_80_country, effect_80 = effect)
) %>%
  pivot_longer(contains("effect"), values_to = "effect") %>%
  drop_na()
# Per-country point intervals of the effect at the three quantiles
p_country <- all_countries %>%
ggplot(aes(effect, fct_reorder(country, effect), colour = name)) +
geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
stat_pointinterval(position = position_dodge(width = .5),
.width = c(.5, .9), point_interval = "median_hdi") +
scale_x_continuous(labels = percent) +
labs(
y = NULL,
x = glue::glue("% Change of APC for 1% change of P_top10% at given quantiles"),
caption = "Averaged predictions over all countries.") +
theme_clean() +
coord_cartesian(xlim = c(-0.001, 0.005)) +
guides(colour = guide_legend(reverse = TRUE))
# Single hue, lightened for the lower quantiles
p_country + scale_color_manual(values = c(
effect_20 = colorspace::lighten("#007FA8", .7),
effect_50 = colorspace::lighten("#007FA8", .4),
effect_80 = "#007FA8"
))
# Attach region and ISO country code to each country (for faceting / joining)
country_identifier <- base %>%
distinct(country, region, country_code) %>%
drop_na()
all_countries_w_region <- all_countries %>%
left_join(country_identifier)
## Joining, by = "country"
# Same per-country plot, faceted by region, with readable legend labels
p_country_region <- all_countries_w_region %>%
mutate(color_label = recode(name, effect_20 = "Effect at 20% quantile",
effect_50 = "Effect at 50% quantile",
effect_80 = "Effect at 80% quantile")) %>%
ggplot(aes(effect, fct_reorder(country, effect), colour = color_label)) +
geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
stat_pointinterval(position = position_dodge(width = .5),
.width = c(.5, .9), point_interval = "median_hdi") +
scale_x_continuous(labels = percent) +
labs(
y = NULL,
x = glue::glue("% Change of APC for 1% change of P_top10% at given quantiles"),
caption = "Averaged predictions over all fields.",
colour = NULL) +
theme_clean() +
facet_grid(rows = vars(str_wrap(region, 9)), space = "free_y", scales = "free_y") +
coord_cartesian(xlim = c(-0.001, 0.005)) +
guides(colour = guide_legend(reverse = FALSE, override.aes = list(size = 2))) +
theme(legend.position = "top")
p_country_region + scale_color_manual(values = c(
"Effect at 20% quantile" = colorspace::lighten("#007FA8", .7),
"Effect at 50% quantile" = colorspace::lighten("#007FA8", .4),
"Effect at 80% quantile" = "#007FA8"
))
Relate the country-level effects to GDP per capita:
# World Bank indicators for 2019; the resulting NY.GDP.PCAP.KD column
# (visible in the printed tibble below) is GDP per capita.
gdp <- WDI::WDI(start = 2019, end = 2019)
# Join on ISO2 country codes; country name exists in both frames, hence the
# country.x / country.y columns in the output.
country_effect_with_gdp <- all_countries_w_region %>%
left_join(gdp, by = c("country_code" = "iso2c"))
country_effect_with_gdp
## # A tibble: 828,000 x 9
##    country.x drawid name   effect   region country_code country.y NY.GDP.PCAP.KD
##    <chr>     <fct>  <chr>  <dbl>    <chr>  <chr>        <chr>     <dbl>
##  1 Algeria   1      effec~ 1.65e-3  Middl~ DZ           Algeria   4115.
##  2 Algeria   2      effec~ 3.66e-4  Middl~ DZ           Algeria   4115.
##  3 Algeria   3      effec~ 9.70e-4  Middl~ DZ           Algeria   4115.
##  4 Algeria   4      effec~ 1.16e-3  Middl~ DZ           Algeria   4115.
##  5 Algeria   5      effec~ 2.40e-4  Middl~ DZ           Algeria   4115.
##  6 Algeria   6      effec~ 3.13e-3  Middl~ DZ           Algeria   4115.
##  7 Algeria   7      effec~ -2.50e-4 Middl~ DZ           Algeria   4115.
##  8 Algeria   8      effec~ 3.01e-3  Middl~ DZ           Algeria   4115.
##  9 Algeria   9      effec~ 9.91e-4  Middl~ DZ           Algeria   4115.
## 10 Algeria   10     effec~ 1.33e-3  Middl~ DZ           Algeria   4115.
## # ... with 827,990 more rows, and 1 more variable: year <int>
# Median effect (50% HDI) per country, plotted against GDP per capita
p <- country_effect_with_gdp %>%
filter(name == "effect_50") %>%
rename(gdp = NY.GDP.PCAP.KD) %>%
group_by(country_code, country.x, gdp, region) %>%
point_interval(effect, .width = .5, .point = median, .interval = hdi) %>%
ggplot(aes(gdp, effect, colour = region, label = country.x)) +
geom_pointrange(aes(ymin = .lower, ymax = .upper)) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::comma) +
scale_color_discrete_qualitative(palette = "Dark 3") +
labs(colour = NULL, x = "GDP per capita",
y = "% increase in APC for 1% increase of P_top10 at the median") +
theme_clean() +
theme(legend.position = "top") +
coord_cartesian(ylim = c(0, .003))
p
## Warning: Removed 1 rows containing missing values (geom_pointrange).
# Interactive version (label aesthetic feeds the hover tooltips)
plotly::ggplotly(p)
This could still be improved by adding the number of universities (or papers) as a size or transparency variable:
# Count distinct universities per country (used below as an alpha cue)
unis_p_country <- df %>%
distinct(country, University) %>%
count(country, name = "n_universities")
# Median + 50% HDI of the effect per country, joined with university counts
pdata <- country_effect_with_gdp %>%
filter(name == "effect_50") %>%
rename(gdp = NY.GDP.PCAP.KD) %>%
group_by(country_code, country.x, gdp, region) %>%
point_interval(effect, .width = .5, .point = median, .interval = hdi) %>%
left_join(unis_p_country, by = c("country.x" = "country"))
# Only a hand-picked subset of countries gets a text label; all others get ""
# NOTE(review): `labels` shadows base::labels() in this session — consider a
# different name.
labels <- pdata %>%
mutate(label = case_when(
country.x %in% c("China", "India", "Iran", "Germany", "United States",
"Brazil", "Luxembourg", "Czech Republic") ~ country.x,
TRUE ~ ""))
pdata %>%
ggplot(aes(gdp, effect, colour = region, label = "")) +
geom_linerange(aes(ymin = .lower, ymax = .upper, alpha = n_universities)) +
geom_point(aes(alpha = n_universities), size = 2) +
ggrepel::geom_text_repel(data = labels, aes(label = label),
show.legend = FALSE, max.overlaps = Inf,
box.padding = 1, min.segment.length = 0) +
scale_y_continuous(labels = scales::percent) +
scale_x_continuous(labels = scales::comma) +
scale_color_discrete_qualitative(palette = "Dark 3") +
# scale_size_continuous(trans = "sqrt") +
# log10 transform so countries with few universities stay visible
scale_alpha_continuous(range = c(.2, 1), trans = "log10") +
labs(colour = "Region", x = "GDP per capita", alpha = "Number of universities",
y = "% increase in APC for 1% increase of P_top10 at the median") +
theme_clean() +
theme(legend.position = "top", legend.box = "vertical") +
coord_cartesian(ylim = c(0, .003))
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
# Baseline prediction per country (analogous to the per-field plot above):
# average the "base" draws over fields, per draw, and plot the distribution.
contrast_50 %>%
group_by(country, drawid) %>%
summarise(intercept = mean(base)) %>%
ggplot(aes(intercept, fct_reorder(country, intercept))) +
stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
point_interval = "median_hdi") +
scale_x_continuous(labels = scales::dollar) +
theme_clean() +
labs(y = NULL, x = expression(paste("Estimated APC at median of ",
P["top 10%"])))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
Not sure whether this is helpful. What we show is quite abstract (the APC
at a hypothetical level of P_top10, averaged across all fields with equal
weights).
The same would apply to the field intercepts above.